scRNA seq pipeline

Author

Greta Bordin

suppressPackageStartupMessages({
  library(SingleCellExperiment)
  library(scran)
  library(scater)
  library(batchelor)
  library(scDblFinder)
  library(sctransform)
  library(muscat)
  library(SEtools)
  library(cowplot)
  library(BiocParallel)
  library(BiocNeighbors)
  library(ComplexHeatmap)
  library(igraph)
  library(sechm)
})
Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
TMB was built with Matrix version 1.4.1
Current Matrix version is 1.5.3
Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package

Preprocessing and Clustering

Workflow inspired by: https://bioconductor.org/books/release/OSCA/ and the materials from the course STA426 - UZH.

Loading the data

sce <- readRDS("week13.SCE.rds")
table(sce$sample_id, sce$group_id)
           
              WT  LPS
  LC016_WT  1730    0
  LC017_LPS    0 1027
  LC019_WT  1064    0
  LC020_LPS    0 1468
  LC022_WT  1806    0
  LC023_LPS    0 1546
  LC025_WT  1259    0
  LC026_LPS    0 2247
# object
sce
class: SingleCellExperiment 
dim: 27998 12147 
metadata(0):
assays(1): counts
rownames(27998): ENSMUSG00000051951.Xkr4 ENSMUSG00000089699.Gm1992 ...
  ENSMUSG00000096730.Vmn2r122 ENSMUSG00000095742.CAAA01147332.1
rowData names(2): ENSEMBL SYMBOL
colnames(12147): AAACCTGCAAAGCGGT-1.LC017 AAACCTGCACGGACAA-1.LC017 ...
  TTTGTCACATCTATGG-1.LC025 TTTGTCAGTCATATGC-1.LC025
colData names(3): sample_id barcode group_id
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
# dimensions
dim(sce)
[1] 27998 12147
# counts assay
counts(sce)[41:45,45:50]
5 x 6 sparse Matrix of class "dgCMatrix"
                                 AAGGAGCAGGAATTAC-1.LC017
ENSMUSG00000042501.Cpa6                                 .
ENSMUSG00000048960.Prex2                                2
ENSMUSG00000057715.A830018L16Rik                        .
ENSMUSG00000097171.Gm17644                              .
ENSMUSG00000101314.Gm29663                              .
                                 AAGGAGCGTAGGAGTC-1.LC017
ENSMUSG00000042501.Cpa6                                 .
ENSMUSG00000048960.Prex2                                .
ENSMUSG00000057715.A830018L16Rik                        2
ENSMUSG00000097171.Gm17644                              .
ENSMUSG00000101314.Gm29663                              .
                                 AAGGCAGTCTCTGAGA-1.LC017
ENSMUSG00000042501.Cpa6                                 .
ENSMUSG00000048960.Prex2                                .
ENSMUSG00000057715.A830018L16Rik                        1
ENSMUSG00000097171.Gm17644                              .
ENSMUSG00000101314.Gm29663                              .
                                 AAGGTTCAGCTAGTTC-1.LC017
ENSMUSG00000042501.Cpa6                                 .
ENSMUSG00000048960.Prex2                                .
ENSMUSG00000057715.A830018L16Rik                        2
ENSMUSG00000097171.Gm17644                              .
ENSMUSG00000101314.Gm29663                              .
                                 AAGGTTCAGCTCTCGG-1.LC017
ENSMUSG00000042501.Cpa6                                 1
ENSMUSG00000048960.Prex2                                .
ENSMUSG00000057715.A830018L16Rik                        2
ENSMUSG00000097171.Gm17644                              .
ENSMUSG00000101314.Gm29663                              .
                                 AAGTCTGGTGTTTGTG-1.LC017
ENSMUSG00000042501.Cpa6                                 1
ENSMUSG00000048960.Prex2                                .
ENSMUSG00000057715.A830018L16Rik                        2
ENSMUSG00000097171.Gm17644                              .
ENSMUSG00000101314.Gm29663                              .
# cells names and metadata
head(colnames(sce))
[1] "AAACCTGCAAAGCGGT-1.LC017" "AAACCTGCACGGACAA-1.LC017"
[3] "AAACCTGTCGCAAACT-1.LC017" "AAACGGGGTATTCGTG-1.LC017"
[5] "AAACGGGGTCGCGGTT-1.LC017" "AAACGGGGTGGTTTCA-1.LC017"
head(colData(sce))
DataFrame with 6 rows and 3 columns
                         sample_id            barcode group_id
                          <factor>        <character> <factor>
AAACCTGCAAAGCGGT-1.LC017 LC017_LPS AAACCTGCAAAGCGGT-1      LPS
AAACCTGCACGGACAA-1.LC017 LC017_LPS AAACCTGCACGGACAA-1      LPS
AAACCTGTCGCAAACT-1.LC017 LC017_LPS AAACCTGTCGCAAACT-1      LPS
AAACGGGGTATTCGTG-1.LC017 LC017_LPS AAACGGGGTATTCGTG-1      LPS
AAACGGGGTCGCGGTT-1.LC017 LC017_LPS AAACGGGGTCGCGGTT-1      LPS
AAACGGGGTGGTTTCA-1.LC017 LC017_LPS AAACGGGGTGGTTTCA-1      LPS
# genes names and metadata
head(rownames(sce))
[1] "ENSMUSG00000051951.Xkr4"    "ENSMUSG00000089699.Gm1992" 
[3] "ENSMUSG00000102343.Gm37381" "ENSMUSG00000025900.Rp1"    
[5] "ENSMUSG00000109048.Rp1"     "ENSMUSG00000025902.Sox17"  
head(rownames(sce))
[1] "ENSMUSG00000051951.Xkr4"    "ENSMUSG00000089699.Gm1992" 
[3] "ENSMUSG00000102343.Gm37381" "ENSMUSG00000025900.Rp1"    
[5] "ENSMUSG00000109048.Rp1"     "ENSMUSG00000025902.Sox17"  
head(rowData(sce))
DataFrame with 6 rows and 2 columns
                                      ENSEMBL      SYMBOL
                                  <character> <character>
ENSMUSG00000051951.Xkr4    ENSMUSG00000051951        Xkr4
ENSMUSG00000089699.Gm1992  ENSMUSG00000089699      Gm1992
ENSMUSG00000102343.Gm37381 ENSMUSG00000102343     Gm37381
ENSMUSG00000025900.Rp1     ENSMUSG00000025900         Rp1
ENSMUSG00000109048.Rp1     ENSMUSG00000109048         Rp1
ENSMUSG00000025902.Sox17   ENSMUSG00000025902       Sox17

QC

# mitochondrial genes:
mito <- grep("mt-", rownames(sce), value = TRUE)
# get QC metrics:
sce <- addPerCellQC(sce, subsets=list(Mt=mito), percent.top=c(5,10))
head(colData(sce))
DataFrame with 6 rows and 11 columns
                         sample_id            barcode group_id       sum
                          <factor>        <character> <factor> <numeric>
AAACCTGCAAAGCGGT-1.LC017 LC017_LPS AAACCTGCAAAGCGGT-1      LPS      1386
AAACCTGCACGGACAA-1.LC017 LC017_LPS AAACCTGCACGGACAA-1      LPS      1193
AAACCTGTCGCAAACT-1.LC017 LC017_LPS AAACCTGTCGCAAACT-1      LPS       738
AAACGGGGTATTCGTG-1.LC017 LC017_LPS AAACGGGGTATTCGTG-1      LPS       906
AAACGGGGTCGCGGTT-1.LC017 LC017_LPS AAACGGGGTCGCGGTT-1      LPS      1743
AAACGGGGTGGTTTCA-1.LC017 LC017_LPS AAACGGGGTGGTTTCA-1      LPS      1450
                          detected percent.top_5 percent.top_10 subsets_Mt_sum
                         <integer>     <numeric>      <numeric>      <numeric>
AAACCTGCAAAGCGGT-1.LC017       935      10.24531       13.70851             22
AAACCTGCACGGACAA-1.LC017       776      17.60268       20.78793              0
AAACCTGTCGCAAACT-1.LC017       538      15.71816       18.83469              5
AAACGGGGTATTCGTG-1.LC017       683       6.62252        9.60265             29
AAACGGGGTCGCGGTT-1.LC017      1103      13.88411       16.86747              5
AAACGGGGTGGTTTCA-1.LC017      1073       9.03448       10.96552             12
                         subsets_Mt_detected subsets_Mt_percent     total
                                   <integer>          <numeric> <numeric>
AAACCTGCAAAGCGGT-1.LC017                   7           1.587302      1386
AAACCTGCACGGACAA-1.LC017                   0           0.000000      1193
AAACCTGTCGCAAACT-1.LC017                   4           0.677507       738
AAACGGGGTATTCGTG-1.LC017                   8           3.200883       906
AAACGGGGTCGCGGTT-1.LC017                   5           0.286862      1743
AAACGGGGTGGTTTCA-1.LC017                   6           0.827586      1450
sce <- addPerFeatureQC(sce)
head(rowData(sce))
DataFrame with 6 rows and 4 columns
                                      ENSEMBL      SYMBOL       mean  detected
                                  <character> <character>  <numeric> <numeric>
ENSMUSG00000051951.Xkr4    ENSMUSG00000051951        Xkr4 1.54317939 50.012349
ENSMUSG00000089699.Gm1992  ENSMUSG00000089699      Gm1992 0.25693587 18.202025
ENSMUSG00000102343.Gm37381 ENSMUSG00000102343     Gm37381 0.00115255  0.115255
ENSMUSG00000025900.Rp1     ENSMUSG00000025900         Rp1 0.00650366  0.633901
ENSMUSG00000109048.Rp1     ENSMUSG00000109048         Rp1 0.00000000  0.000000
ENSMUSG00000025902.Sox17   ENSMUSG00000025902       Sox17 0.00773854  0.609204
# plotting some of the metrics
qc <- as.data.frame(colData(sce))
ggplot(qc, aes(subsets_Mt_percent)) + geom_histogram() + facet_wrap(~sample_id)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(qc, aes(log10(sum))) + geom_histogram() + facet_wrap(~sample_id)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(qc, aes(log10(sum), log10(detected))) + geom_point() + geom_density2d()

# Set thresholds on library sizes and detection rate:
sce$qc.out <- isOutlier(log(sce$sum),nmads=3,batch=sce$sample_id) |
              isOutlier(log(sce$detected),nmads=3,batch=sce$sample_id)

table(sce$qc.out)

FALSE  TRUE 
12115    32 
# flag doublets:
gc()
           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
Ncells 11349446 606.2   17705992 945.7         NA 17705992 945.7
Vcells 46164769 352.3   93715150 715.0      16384 90520856 690.7
sce <- scDblFinder(sce, samples="sample_id")
table(sce$scDblFinder.class)

singlet doublet 
  11447     700 

Normalization, filtering, reduction

# getting rid of genes that were rarely identified
sce <- sce[rowData(sce)$detected>=4,]
dim(sce)
[1]  8887 12147
# standard log-normalization
#sce <- logNormCounts(sce)
# variance-stabilizing transformation
vst <- suppressWarnings(sctransform::vst(counts(sce)))
Calculating cell attributes from input UMI matrix: log_umi
Variance stabilizing transformation of count matrix of size 8887 by 12147
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 12147 cells

  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
Found 115 outliers - those will be ignored in fitting/regularization step
Second step: Get residuals using fitted parameters for 8887 genes

  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |======================================================================| 100%
Calculating gene attributes
Wall clock passed: Time difference of 2.832951 mins
logcounts(sce) <- vst$y

# get highly-variable genes
hvg <- row.names(sce)[order(vst$gene_attr$residual_variance, decreasing=TRUE)[1:2000]]
# run PCA
sce <- runPCA(sce, subset_row=hvg)

# check the variance explained by the PCs:
pc.var <- attr(reducedDim(sce),"percentVar")
plot(pc.var, xlab="PCs", ylab="% variance explained")

# restrict to the first 20 components:
reducedDim(sce) <- reducedDim(sce)[,1:20]

# run and plot 2d projections based on the PCA:
sce <- runTSNE(sce, dimred="PCA")
sce <- runUMAP(sce, dimred="PCA", n_neighbors=30)
# plot by doublet score
plot_grid(plotUMAP(sce, colour_by="scDblFinder.score"), 
          plotTSNE(sce, colour_by="scDblFinder.score"))

# filter out low quality cells
sce <- sce[,sce$scDblFinder.class!="doublet" & !sce$qc.out]

# check mixing:
plot_grid(plotTSNE(sce[, sample(1:ncol(sce), ncol(sce))], colour_by="group_id"), 
          plotTSNE(sce[, sample(1:ncol(sce), ncol(sce))], colour_by="sample_id"))

Batch correction

# batch corrected - mutual nearest neighbors
sce2 <- fastMNN(sce, batch=sce$sample_id, BNPARAM=AnnoyParam())
# corrected PCA
reducedDim(sce, type="PCA") <- reducedDim(sce2)[,1:20]
# recompute the 2d projections
sce <- runTSNE(sce, dimred="PCA")
sce <- runUMAP(sce, dimred="PCA", n_neighbors=25)
plot_grid(plotTSNE(sce[, sample(1:ncol(sce), ncol(sce))], colour_by="group_id"), 
          plotTSNE(sce[, sample(1:ncol(sce), ncol(sce))], colour_by="sample_id"))

Clustering

g <- buildKNNGraph(sce, BNPARAM=AnnoyParam(), use.dimred="PCA", k=30)
sce$cluster <- as.factor(cluster_louvain(g)$membership)

# lower resolution
g <- buildKNNGraph(sce, BNPARAM=AnnoyParam(), use.dimred="PCA", k=300)
sce$cluster2 <- as.factor(cluster_louvain(g)$membership)

saveRDS(sce, file="week13.SCE.processed.rds")

plot_grid(plotTSNE(sce, colour_by="cluster", text_by="cluster"),
           plotTSNE(sce, colour_by="cluster2", text_by="cluster2"))

Cluster abundances across samples

ca <- table(cluster=sce$cluster, sample=sce$sample_id)
ggplot(as.data.frame(ca), aes(sample, cluster, fill=Freq)) + geom_tile() + geom_text(aes(label=Freq))

Cluster annotation

De-novo markers identification

# identify genes that differ between clusters:
mm <- findMarkers(sce, groups=sce$cluster, test.type="binom")
# select the top 5 markers by cluster:
markers <- unique(unlist(lapply(mm, FUN=function(x){
  head(row.names(x[x$FDR<0.01,]),5)
})))

Known markers

genes <- list(
  astrocytes = c("Aqp4", "Gfap", "Fgfr3","Dio2"),
  endothelial = c("Cldn5","Nostrin","Flt1"),
  microglia = c("C1qb","Tyrobp","P2ry12", "Csf1r", "Irf8"),
  neuronal = c("Snap25", "Stmn2", "Syn1", "Rbfox3"),
  excNeuron = c("Slc17a7","Camk2a","Grin2b","Fezf2"),
  inhNeuron = c("Gad1","Lhx6","Adarb2"),
  oligodendro = c("Opalin","Plp1","Mag","Mog"),
  OPC = c("Pdgfra","Sox6","Bcan")
)

# find the matching row names for each gene:
km <- lapply(genes, FUN=function(g) grep(paste0(g, "$", collapse="|"), rownames(sce), value=TRUE))
km
$astrocytes
[1] "ENSMUSG00000054252.Fgfr3" "ENSMUSG00000020932.Gfap" 
[3] "ENSMUSG00000024411.Aqp4" 

$endothelial
[1] "ENSMUSG00000029648.Flt1"  "ENSMUSG00000041378.Cldn5"

$microglia
[1] "ENSMUSG00000036353.P2ry12" "ENSMUSG00000036905.C1qb"  

$neuronal
[1] "ENSMUSG00000027273.Snap25" "ENSMUSG00000037217.Syn1"  
[3] "ENSMUSG00000027500.Stmn2"  "ENSMUSG00000025576.Rbfox3"

$excNeuron
[1] "ENSMUSG00000030209.Grin2b"  "ENSMUSG00000070570.Slc17a7"
[3] "ENSMUSG00000021743.Fezf2"   "ENSMUSG00000024617.Camk2a" 

$inhNeuron
[1] "ENSMUSG00000070880.Gad1"   "ENSMUSG00000052551.Adarb2"

$oligodendro
[1] "ENSMUSG00000031425.Plp1"   "ENSMUSG00000036634.Mag"   
[3] "ENSMUSG00000076439.Mog"    "ENSMUSG00000050121.Opalin"

$OPC
[1] "ENSMUSG00000004892.Bcan"   "ENSMUSG00000029231.Pdgfra"
[3] "ENSMUSG00000051910.Sox6"  

Pseudo-bulk aggregation

# mean logcounts by cluster:
pb <- aggregateData(sce, "logcounts", by=c("cluster"), fun="mean")

assayNames(pb) <- "logcounts"
rowData(pb)$marker4 <- NA
rowData(pb)[unlist(km),"marker4"] <- rep(names(km),lengths(km))
sechm::sechm(pb, unlist(km), assayName = "logcounts", gaps_row = "marker4", show_colnames = TRUE, do.scale = TRUE, breaks=1, row_title_rot=0)

# heatmap of the mean logcounts of the known markers:
h1 <- pheatmap(assay(pb)[unlist(km),], annotation_row=data.frame(row.names=unlist(km), type=rep(names(km), lengths(km))), split=rep(names(km), lengths(km)), cluster_rows=FALSE, scale="row")
# heatmap for the de-novo markers:
h2 <- pheatmap(assay(pb)[markers,], scale="row")
# Assign clusters to the cell type whose markers are the most expressed

# get rid of the unspecific neuronal markers:
km <- km[names(km)!="neuronal"]
# extract the pseudo-bulk counts of the markers for each cluster
mat <- assay(pb)[unlist(km),]
# aggregate across markers of the same type
mat <- aggregate(t(scale(t(mat))), by=list(type=rep(names(km), lengths(km))), FUN=sum)
# for each column (cluster), select the row (cell type) which has the maximum aggregated value
cl2 <- mat[,1][apply(mat[,-1], 2, FUN=which.max)]
# convert the cells' cluster labels to cell type labels:
sce$cluster2 <- cl2[sce$cluster]

# Aggregate to pseudo-bulk using the new clusters
pb <- aggregateData(sce, "logcounts", by=c("cluster2"), fun="mean")
# Plot the expression of the markers as a sanity check
h1 <- pheatmap(assay(pb)[unlist(km),], annotation_row=data.frame(row.names=unlist(km), type=rep(names(km), lengths(km))), split=rep(names(km), lengths(km)), cluster_rows=FALSE, scale="row")
plotTSNE(sce, colour_by="cluster2", text_by="cluster2")

Differential state analysis

# Aggregate by cluster x sample to perform pseudo-bulk differential state analysis
sce <- muscat::prepSCE(sce, kid="cluster2")
pb <- aggregateData(sce)
assays(pb)
List of length 7
names(7): astrocytes endothelial excNeuron inhNeuron microglia oligodendro OPC
# MDS plot
pbMDS(pb)